# Если у вас не установлены какие-то из библиотек ниже, то установить их можно следующей командой. Эти библиотеки пригодятся нам во время занятия, но подгружать в library() мы их будем тогда, когда будем затрагивать соответствующую тему.
#install.packages(c('ggpubr', 'plotly', 'corrplot', 'corrr', 'ggfortify', 'pheatmap', 'factoextra', 'FactoMineR', 'ggbiplot'))
#install.packages("pheatmap")
# Загрузим библиотеки
library(dplyr)##
## Присоединяю пакет: 'dplyr'
## Следующие объекты скрыты от 'package:stats':
##
## filter, lag
## Следующие объекты скрыты от 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(ggpubr)
insc <- read.csv('data/insurance_cost.csv')Раскраска - по колонке smoker.
Загрузим библиотеку:
# install.packages("plotly")
# или
# devtools::install_github("ropensci/plotly")
library(plotly)##
## Присоединяю пакет: 'plotly'
## Следующий объект скрыт от 'package:ggplot2':
##
## last_plot
## Следующий объект скрыт от 'package:stats':
##
## filter
## Следующий объект скрыт от 'package:graphics':
##
## layout
skimr::skim(insc)| Name | insc |
| Number of rows | 1338 |
| Number of columns | 7 |
| _______________________ | |
| Column type frequency: | |
| character | 3 |
| numeric | 4 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| sex | 0 | 1 | 4 | 6 | 0 | 2 | 0 |
| smoker | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
| region | 0 | 1 | 9 | 9 | 0 | 4 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| age | 0 | 1 | 39.21 | 14.05 | 18.00 | 27.00 | 39.00 | 51.00 | 64.00 | ▇▅▅▆▆ |
| bmi | 0 | 1 | 30.66 | 6.10 | 15.96 | 26.30 | 30.40 | 34.69 | 53.13 | ▂▇▇▂▁ |
| children | 0 | 1 | 1.09 | 1.21 | 0.00 | 0.00 | 1.00 | 2.00 | 5.00 | ▇▂▂▁▁ |
| charges | 0 | 1 | 13270.42 | 12110.01 | 1121.87 | 4740.29 | 9382.03 | 16639.91 | 63770.43 | ▇▂▁▁▁ |
plot_ly(
data = insc,
x = ~ charges,
y = ~ bmi,
color = ~smoker
) %>%
layout(
title = 'Отношение индекса массы тела и расходов на страховку',
yaxis = list(title = 'ИМТ',
zeroline = FALSE), # Уберём выделения нулевых осей по y
xaxis = list(title = 'Расходы на страховку',
zeroline = FALSE)) # Уберём выделения нулевых осей по x## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
Раскраска - по колонке smoker (с помощью ggplot).
Загрузим библиотеку:
library(ggplot2)
library(magrittr)insc %>%
#filter(mass != 0 & triceps != 0) %>%
ggplot(aes(x = charges, y = bmi, color = smoker)) +
geom_point(size=3) +
theme_minimal() +
ggtitle('Отношение индекса массы тела и расходов на страховку') +
theme(axis.text.x = element_text(size = 14)) library(corrplot)## corrplot 0.92 loaded
Получаем объект матрицы:
# Для более "чистого" результата, избавляемся от ошибочных значений
insc_num <- insc %>%
#filter(glucose != 0 & pressure != 0 & triceps != 0 & insulin != 0 & mass != 0 & age != 0 ) %>%
select(is.integer | is.numeric) # Обратите внимание, в dplyr можно задавать выборку колонок через команды определения формата данных## Warning: Use of bare predicate functions was deprecated in tidyselect 1.1.0.
## ℹ Please use wrap predicates in `where()` instead.
## # Was:
## data %>% select(is.integer)
##
## # Now:
## data %>% select(where(is.integer))
## Warning: Use of bare predicate functions was deprecated in tidyselect 1.1.0.
## ℹ Please use wrap predicates in `where()` instead.
## # Was:
## data %>% select(is.numeric)
##
## # Now:
## data %>% select(where(is.numeric))
head(insc_num)# Получаем непосредственно матрицу
insc_cor <- cor(insc_num)
insc_cor## age children bmi charges
## age 1.0000000 0.04246900 0.1092719 0.29900819
## children 0.0424690 1.00000000 0.0127589 0.06799823
## bmi 0.1092719 0.01275890 1.0000000 0.19834097
## charges 0.2990082 0.06799823 0.1983410 1.00000000
Визуализируем её в corplot:
corrplot(insc_cor, method = 'number')library(corrr)insc_cor %>%
network_plot(min_cor = .0)#install.packages("RColorBrewer")
library('RColorBrewer')
COL1(sequential = c("Oranges", "Purples", "Reds", "Blues", "Greens",
"Greys", "OrRd", "YlOrRd", "YlOrBr", "YlGn"), n = 200)## [1] "#FFF5EB" "#FEF4E9" "#FEF3E8" "#FEF3E7" "#FEF2E6" "#FEF1E5" "#FEF1E4"
## [8] "#FEF0E2" "#FEF0E1" "#FEEFE0" "#FEEEDF" "#FEEEDE" "#FEEDDD" "#FEEDDB"
## [15] "#FEECDA" "#FEEBD9" "#FEEBD8" "#FEEAD7" "#FEEAD6" "#FEE9D4" "#FEE8D3"
## [22] "#FEE8D2" "#FEE7D1" "#FEE7D0" "#FEE6CF" "#FDE5CD" "#FDE5CC" "#FDE4CA"
## [29] "#FDE3C8" "#FDE2C6" "#FDE1C4" "#FDE0C3" "#FDDFC1" "#FDDEBF" "#FDDDBD"
## [36] "#FDDDBC" "#FDDCBA" "#FDDBB8" "#FDDAB6" "#FDD9B5" "#FDD8B3" "#FDD7B1"
## [43] "#FDD6AF" "#FDD5AD" "#FDD5AC" "#FDD4AA" "#FDD3A8" "#FDD2A6" "#FDD1A5"
## [50] "#FDD0A3" "#FDCFA1" "#FDCE9F" "#FDCC9D" "#FDCB9A" "#FDCA98" "#FDC896"
## [57] "#FDC794" "#FDC691" "#FDC48F" "#FDC38D" "#FDC18B" "#FDC089" "#FDBF86"
## [64] "#FDBD84" "#FDBC82" "#FDBB80" "#FDB97E" "#FDB87B" "#FDB779" "#FDB577"
## [71] "#FDB475" "#FDB273" "#FDB170" "#FDB06E" "#FDAE6C" "#FDAD6A" "#FDAC68"
## [78] "#FDAA66" "#FDA964" "#FDA862" "#FDA660" "#FDA55E" "#FDA45D" "#FDA25B"
## [85] "#FDA159" "#FDA057" "#FD9E55" "#FD9D53" "#FD9C51" "#FD9A4F" "#FD994D"
## [92] "#FD984C" "#FD964A" "#FD9548" "#FD9446" "#FD9244" "#FD9142" "#FD9040"
## [99] "#FD8E3E" "#FD8D3C" "#FC8C3B" "#FC8A39" "#FB8937" "#FB8736" "#FA8634"
## [106] "#FA8532" "#F98331" "#F9822F" "#F8802D" "#F87F2C" "#F77D2A" "#F77C29"
## [113] "#F67A27" "#F67925" "#F67824" "#F57622" "#F57520" "#F4731F" "#F4721D"
## [120] "#F3701B" "#F36F1A" "#F26D18" "#F26C16" "#F16A15" "#F16913" "#F06812"
## [127] "#EF6611" "#EE6511" "#ED6410" "#EC620F" "#EB610E" "#EA600E" "#E95E0D"
## [134] "#E85D0C" "#E75C0C" "#E65A0B" "#E5590A" "#E45809" "#E35609" "#E25508"
## [141] "#E15407" "#E05206" "#DF5106" "#DF5005" "#DE4E04" "#DD4D04" "#DC4C03"
## [148] "#DB4A02" "#DA4901" "#D94801" "#D74701" "#D54601" "#D34601" "#D14501"
## [155] "#CF4401" "#CD4301" "#CB4301" "#C94201" "#C74101" "#C54001" "#C24001"
## [162] "#C03F01" "#BE3E02" "#BC3E02" "#BA3D02" "#B83C02" "#B63B02" "#B43B02"
## [169] "#B23A02" "#B03902" "#AE3802" "#AC3802" "#AA3702" "#A83602" "#A63602"
## [176] "#A43503" "#A33403" "#A13403" "#9F3303" "#9E3303" "#9C3203" "#9B3103"
## [183] "#993103" "#983003" "#963003" "#942F03" "#932E03" "#912E03" "#902D03"
## [190] "#8E2D03" "#8D2C03" "#8B2B03" "#892B03" "#882A03" "#862A03" "#852903"
## [197] "#832803" "#822803" "#802703" "#7F2704"
COL2(diverging = c("RdBu", "BrBG", "PiYG", "PRGn", "PuOr", "RdYlBu"), n = 200)## [1] "#67001F" "#6A011F" "#6E0220" "#720320" "#760421" "#790622" "#7D0722"
## [8] "#810823" "#850923" "#880A24" "#8C0C25" "#900D25" "#940E26" "#970F26"
## [15] "#9B1027" "#9F1228" "#A31328" "#A71429" "#AA1529" "#AE162A" "#B2182B"
## [22] "#B31B2C" "#B51F2E" "#B72330" "#B92632" "#BB2A33" "#BD2E35" "#BE3137"
## [29] "#C03538" "#C2383A" "#C43C3C" "#C6403D" "#C7433F" "#C94741" "#CB4B43"
## [36] "#CD4E44" "#CF5246" "#D05548" "#D25949" "#D45D4B" "#D6604D" "#D76450"
## [43] "#D96752" "#DA6B55" "#DC6E58" "#DD725A" "#DF755D" "#E07860" "#E27C62"
## [50] "#E37F65" "#E58368" "#E6866A" "#E88A6D" "#E98D70" "#EB9172" "#EC9475"
## [57] "#EE9878" "#EF9B7A" "#F19F7D" "#F2A280" "#F4A583" "#F4A886" "#F5AB89"
## [64] "#F5AD8D" "#F5B090" "#F6B394" "#F6B697" "#F7B89B" "#F7BB9E" "#F8BEA2"
## [71] "#F8C0A5" "#F9C3A9" "#F9C6AC" "#FAC9B0" "#FACBB3" "#FACEB7" "#FBD1BA"
## [78] "#FBD3BD" "#FCD6C1" "#FCD9C4" "#FDDBC8" "#FDDDCA" "#FDDFCD" "#FDE1D0"
## [85] "#FDE2D3" "#FDE4D6" "#FDE6D9" "#FDE8DB" "#FDEADE" "#FDECE1" "#FEEDE4"
## [92] "#FEEFE7" "#FEF1E9" "#FEF3EC" "#FEF5EF" "#FEF6F2" "#FEF8F5" "#FEFAF7"
## [99] "#FEFCFA" "#FEFEFD" "#FDFEFE" "#FBFDFD" "#F9FBFD" "#F6FAFC" "#F4F9FB"
## [106] "#F2F7FA" "#EFF6FA" "#EDF5F9" "#EBF3F8" "#E9F2F7" "#E6F1F7" "#E4EFF6"
## [113] "#E2EEF5" "#DFEDF4" "#DDECF4" "#DBEAF3" "#D8E9F2" "#D6E8F1" "#D4E6F1"
## [120] "#D1E5F0" "#CFE4EF" "#CBE2EE" "#C8E0ED" "#C5DFEC" "#C2DDEB" "#BFDBEA"
## [127] "#BCDAEA" "#B8D8E9" "#B5D7E8" "#B2D5E7" "#AFD3E6" "#ACD2E5" "#A9D0E4"
## [134] "#A5CFE3" "#A2CDE2" "#9FCBE1" "#9CCAE0" "#99C8E0" "#96C7DF" "#92C5DE"
## [141] "#8FC3DD" "#8BC0DB" "#87BEDA" "#83BBD8" "#7FB9D7" "#7BB6D6" "#77B4D4"
## [148] "#73B1D3" "#6FAFD2" "#6BACD0" "#67AACF" "#63A7CE" "#5FA5CC" "#5BA2CB"
## [155] "#57A0CA" "#539DC8" "#4F9BC7" "#4B98C5" "#4796C4" "#4393C3" "#4191C2"
## [162] "#3F8EC0" "#3E8CBF" "#3C8ABE" "#3A88BD" "#3985BC" "#3783BB" "#3581B9"
## [169] "#337FB8" "#327CB7" "#307AB6" "#2E78B5" "#2D76B4" "#2B73B3" "#2971B1"
## [176] "#286FB0" "#266DAF" "#246AAE" "#2268AD" "#2166AC" "#1F63A8" "#1E60A4"
## [183] "#1C5EA1" "#1B5B9D" "#1A5899" "#185595" "#175391" "#15508E" "#144D8A"
## [190] "#134B86" "#114882" "#10457F" "#0E427B" "#0D4077" "#0C3D73" "#0A3A70"
## [197] "#09386C" "#073568" "#063264" "#053061"
library('corrplot')
corrplot(insc_cor, order = 'AOE', col = COL2('RdBu', 10))corrplot(insc_cor, order = 'AOE', cl.pos = 'b', tl.pos = 'd',
col = COL2('PRGn'), diag = FALSE)Превратим все номинативные переменные в бинарные/дамми. Т.е. sex и smoker сделаем бинарными (1/0), а каждое уникальное значение region – отдельной колонкой, где 1 говорит о наличии этого признака для наблюдения, а 0 – об отсутствии.
#install.packages('fastDummies')
library('fastDummies')insc2 <- dummy_cols(insc, select_columns = c('sex','smoker','region'))Создим новый датафрейм, где оставим только нумерические переменные.
insc2_num <- insc2 %>%
select(is.integer | is.numeric) # Обратите внимание, в dplyr можно задавать выборку колонок через команды определения формата данныхСтандартизируем значения:
insc2_num.sc <- scale(insc2_num)Создаем матрицу дистанций:
rownames(insc2_num.sc) <- insc2_num.sc[0,]
insc2_dist <- dist(as.matrix(insc2_num.sc), method = "euclidean")
as.matrix(insc2_dist)[1:6, 1:6]## 1 2 3 4 5 6
## 1 0.000000 5.825239 6.253322 5.747217 5.759522 4.978144
## 2 5.825239 0.000000 1.823634 4.289327 3.582563 3.361726
## 3 6.253322 1.823634 0.000000 4.663256 4.148789 3.956548
## 4 5.747217 4.289327 4.663256 0.000000 1.807952 4.583438
## 5 5.759522 3.582563 4.148789 1.807952 0.000000 4.329507
## 6 4.978144 3.361726 3.956548 4.583438 4.329507 0.000000
Высчитываем дендрограмму кластеров:
insc2_hc <- hclust(d = insc2_dist, method = "ward.D2")
insc2_gower <- cluster::daisy(as.matrix(insc2_dist), metric = "gower")Визуализируем:
library('factoextra')## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_dend(insc2_hc, cex = 0.6, label_cols = rownames(insc2_num)) # cex() - размер лейблов## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <]8;;https://github.com/kassambara/factoextra/issueshttps://github.com/kassambara/factoextra/issues]8;;>.
insc2_hc_k5 <- cutree(insc2_hc,
k = 5) # Создаём вектор принадлежности к кластерам
head(insc2_hc_k5, n = 4)## [1] 1 2 2 3
table(insc2_hc_k5)## insc2_hc_k5
## 1 2 3 4 5
## 274 273 267 257 267
Визуализируем:
fviz_dend(insc2_hc,
k = 5, # Задаём число кластеров
cex = 0.5, # Задаем размер лейблов
k_colors = c("#2E9FDF", "#E7B800", "#FC4E07", "#DEB887", "#5758BB"),
color_labels_by_k = TRUE, # Соотнести цвета с кластерами
rect = TRUE, # Добавить "квадратик" вокруг групп
)fviz_cluster(list(data = insc2_dist, cluster = insc2_hc_k5, labels = rownames(insc2_num)),
palette = c("#2E9FDF", "#E7B800", "#FC4E07", "#DEB887", "#5758BB"),
ellipse.type = "convex", # Объединить кластеры элипсом
repel = TRUE, # Избежать наслоения лейблов
show.clust.cent = FALSE, # Показывать центр кластера
ggtheme = theme_minimal()
)## Warning: ggrepel: 1331 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
library('pheatmap')
pheatmap(insc2_num.sc)insc2_num.pca <- prcomp(insc2_num, center = TRUE, scale. = TRUE)
summary(insc2_num.pca)## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.6737 1.4023 1.2417 1.1513 1.1498 1.07055 0.98583
## Proportion of Variance 0.2334 0.1639 0.1285 0.1105 0.1102 0.09551 0.08099
## Cumulative Proportion 0.2334 0.3973 0.5258 0.6363 0.7465 0.84196 0.92295
## PC8 PC9 PC10 PC11 PC12
## Standard deviation 0.87032 0.40877 3.048e-15 1.69e-15 5.372e-16
## Proportion of Variance 0.06312 0.01392 0.000e+00 0.00e+00 0.000e+00
## Cumulative Proportion 0.98608 1.00000 1.000e+00 1.00e+00 1.000e+00
#ggbiplot()
#insc2_num_pc <- mutate(pc1 = region_)factoextra::fviz_pca_var(insc2_num.pca, col.var = "contrib")Вывод:
- Уровень объясненной дисперсии по первым двум компонентам не превышает
70%.
- Наибольший вклад в аназируемые главные компоненты вносят:
sex_female, sex_male, smoker_no,
smoker_yes.
- Закономерно, что парные переменные (sex_female и
sex_male, smoker_no и smoker_yes)
отрицательно скоррелированы внутри представленных главных
компонент.
- Переменные: age, region_* вносят наименьший
вклад в аназируемые главные компоненты, так как длина их стрелочек
минимальна по длине.